library(dplyr)
library(ggplot2)
library(DESeq2)
library(tximport)
library(readr)
library(tximportData)
library(readxl)
library(knitr)
library(tidyverse)
library(pheatmap)
library(RColorBrewer)
library(viridis)
library(ggrepel)
library(EnhancedVolcano)
Plasma 1:10
Viridis 1:10
Cividis 1:10
Magma 1:10
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document.
##This section of code will generate a summary table of the samples sequenced and their sequencing and alignment metrics
| Sample | patient | reads | %Q30 | Duplication rate | % reads with adapter | STAR alignment number | percent aligned | splices annotated | salmon mapping |
|---|---|---|---|---|---|---|---|---|---|
| Bulk | HTB314 | 71816976 | 94.3 | 57 | 2.6 | 63614504 | 88.58 | 27340654 | 85.6967 |
| Bulk | MDS268 | 31441538 | 93.0 | 24 | 2.3 | 25549835 | 81.26 | 5546270 | 58.2075 |
| Bulk | MDS280 | 28794421 | 93.0 | 24 | 2.3 | 23317205 | 80.98 | 5115546 | 58.8614 |
| CD123+ | HTB314 | 61568500 | 94.0 | 56 | 2.7 | 55359277 | 89.91 | 22831187 | 86.4197 |
| CD123+ | MDS268 | 32307881 | 93.0 | 27 | 3.3 | 26243441 | 81.23 | 7290861 | 64.2150 |
| CD123+ | MDS280 | 23745205 | 93.0 | 34 | 2.6 | 21035358 | 88.59 | 5668653 | 71.3245 |
| CD123- | HTB314 | 83260626 | 94.0 | 58 | 2.6 | 74804722 | 89.84 | 31876605 | 87.6505 |
| CD123- | MDS268 | 27917999 | 93.0 | 26 | 2.7 | 22835189 | 81.79 | 5793322 | 63.9284 |
| CD123- | MDS280 | 24767573 | 93.0 | 32 | 2.6 | 22467937 | 90.72 | 4773823 | 62.7193 |
##This section of code will import and format the data for DeSeq2
#to generate a vector of names and file locations
files<-file.path("~/Desktop/Jordan files/Counts/salmon/salmon/", list.files("~/Desktop/Jordan files/Counts/salmon/salmon/"), "quant.sf")
names(files)<-list.files("~/Desktop/Jordan files/Counts/salmon/salmon")
#to call in a gene_map this was derived by pulling it out of the fasta file with a grep command for ENST and ENSG and pasting them together
gene_map=read_csv("~/Desktop/Jordan files/Counts/salmon/gene_map.csv", col_names=c('enstid', 'ensgid'))
# import transcript level counts
txi.salmon.t<-tximport(files, type="salmon", txOut=TRUE)
txi.salmon.g<-tximport(files=files, type="salmon", tx2gene= gene_map, ignoreTxVersion = TRUE, countsFromAbundance = 'lengthScaledTPM' )
### this code works but if I remove the ignoreTxVersion I get an error, this may have to do with how I am generating my tx2gene file
# Extract counts only
counts <- txi.salmon.g$counts %>%
as.data.frame()
#Extract TPM
tpms <- data.frame(txi.salmon.g$abundance)
##for clients the counts and tpm files should be written out
##This chunk of code sets an expression threshold for TMP where all samples have at least 5 counts
expressed_genes<-tpms %>% filter_all(all_vars(. > 5))
##This chunk of code generates a heatmap using the spearman method as well as a correlation heatmap ## PCA plot
exp.pca <- prcomp(t(log2(expressed_genes)))
exp.pca.summary <- summary(exp.pca)$importance
pc1var = round(exp.pca.summary[3,1] * 100, 1)
pc2var = round(exp.pca.summary[3,2] * 100 - pc1var, 1)
exp.pca.pc <- data.frame(exp.pca$x, sample = colnames(expressed_genes))
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## SampleName = col_character(),
## FileName = col_character(),
## patient = col_character(),
## cellType = col_character()
## )
## 3. Run Differential Expression testing using DESeq2 and Calculate Gene Set Enrichment ## Compare 123pos vs 123neg, 123neg vs bulk, and 123pos vs bulk #### sig = padj <0.01 and abs(l2fc) >0.5 ####
coldata<-dplyr::select(metadata, -FileName)
coldata<-column_to_rownames(coldata, 'SampleName')
## need to confirm that all names are in the same order
all(rownames(coldata) %in% colnames(txi.salmon.g$counts))
## [1] TRUE
all(rownames(coldata) == colnames(txi.salmon.g$counts))
## [1] FALSE
coldata<-coldata[colnames(txi.salmon.g$counts),]
all(rownames(coldata) == colnames(txi.salmon.g$counts))
## [1] TRUE
dds <- DESeqDataSetFromTximport(txi.salmon.g,
colData = coldata,
design = ~ patient + cellType)
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
## using just counts from tximport
### unfiltered data
dds.unfiltered <- DESeq(dds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
res.unfiltered <- results( dds.unfiltered)
##filtered data
dds.filtered<-dds[rowMins(counts(dds))>5,]
dds.filtered<-DESeq(dds.filtered)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
res.filtered<-results(dds.filtered)
### need to subset to do pairwise comparison unclear if this keeps the patient design aspect
res_123posvs123neg_unfiltered<-results( dds.unfiltered, contrast=c("cellType", "123pos", "123neg"))
res_123posvsBulk_unfiltered<-results( dds.unfiltered, contrast=c("cellType", "123pos", "bulk"))
res_123negvsBulk_unfiltered<-results( dds.unfiltered, contrast=c("cellType", "123neg", "bulk"))
res_123posvs123neg_filtered<-results(dds.filtered, contrast=c("cellType", "123pos", "123neg"))
res_123posvsBulk_filtered<-results(dds.filtered, contrast=c("cellType", "123pos", "bulk"))
res_123negvsBulk_filtered<-results(dds.filtered, contrast=c("cellType", "123neg", "bulk"))
## look at the numbers of genes meeting threshold the log fold change call is not changing things
sum( res_123posvs123neg_unfiltered$pvalue < 0.01 & abs(res_123posvs123neg_unfiltered$log2FoldChange) >= 0.5, na.rm=TRUE )
## [1] 1405
sum( res_123posvsBulk_unfiltered$pvalue < 0.01 & abs(res_123posvsBulk_unfiltered$log2FoldChange) >= 0.5, na.rm=TRUE )
## [1] 2382
sum(res_123negvsBulk_unfiltered$pvalue < 0.01 & abs(res_123negvsBulk_unfiltered$log2FoldChange) >= 0.5, na.rm=TRUE )
## [1] 2336
sum( res_123posvs123neg_filtered$pvalue < 0.01 & abs(res_123posvs123neg_filtered$log2FoldChange) >= 0.5, na.rm=TRUE )
## [1] 194
sum( res_123posvsBulk_filtered$pvalue < 0.01 & abs(res_123posvsBulk_filtered$log2FoldChange) >= 0.5, na.rm=TRUE )
## [1] 796
sum(res_123negvsBulk_filtered$pvalue < 0.01 & abs(res_123negvsBulk_filtered$log2FoldChange) >= 0.5, na.rm=TRUE )
## [1] 831
## Warning: Removed 840 rows containing missing values (geom_point).
## Warning: Removed 19307 rows containing missing values (geom_point).
## Warning: Removed 20003 rows containing missing values (geom_point).
## Warning: Removed 280 rows containing missing values (geom_point).
## Warning: Removed 20003 rows containing missing values (geom_point).
## Warning: Removed 280 rows containing missing values (geom_point).
###MA plots
pca_data<-vst(dds, blind=T)
ntop=500
rv <- rowVars(assay(pca_data))
select <- order(rv, decreasing=TRUE)[seq_len(min(ntop, length(rv)))]
pca <- prcomp(t(assay(pca_data)[select,]))
percentVar <- data.frame(percentVar=pca$sdev^2/sum(pca$sdev^2))%>%
mutate(pc=1:n())%>%dplyr::select(pc, percentVar)
pca_df<-pca$x%>%data.frame()%>%mutate(type=metadata$cellType)
alt_col_values=c("#88CCEE", "#CC6677", "#DDCC77")
Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.